Generating independent data with similar marginal distributions

We generate a new random dataset based on the value frequencies of the original features. The simulated dataset has the same features as the original one, and the marginal distribution of each feature is the same as in the original data, but the features in the simulated data are independent of each other.

In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
In [2]:
# Read the cleaned source table from the Excel workbook. The column index is
# left as the last expression so the available features are shown as output.
data = pd.read_excel(io="./data/clean/data_all_clean.xlsx")
data.columns
Out[2]:
Index(['Unnamed: 0', 'ID', 'Képzés', 'Felvétel_féléve',
       'felvi_osszes_skalazott', 'Hozott/Tanulmányi_pontszám',
       'Szerzett/Érettségi_pontsz', 'Statusz_vegzett',
       'Oklevél_eredménye_számmal', 'Felvételkori_pénzü_státusz', 'regolya',
       'regolya.1', 'idegen_nyelv', 'magyar nyelv és irodalom', 'matematika',
       'történelem', 'választott_tárgy', 'Hallgató_ID', 'idegen_nyelv_ks',
       'magyar_irodalom_ks', 'magyar_nyelvtan_ks', 'matematika_ks',
       'történelem_ks', 'választható_tantárgy_ks', 'nyv_pontok', 'nem_ferfi',
       'erettsegi_ota_eltelt_évek', 'ks_helye', 'verseny'],
      dtype='object')
In [3]:
# Remove columns that should not be simulated as features.
# NOTE(review): 'ID' / 'Hallgató_ID' look like row identifiers and
# 'Unnamed: 0' like a saved index column — confirm against the data source.
#
# The frame is rebound instead of mutated with `inplace=True`, and
# errors="ignore" skips labels that are already gone, so re-running this cell
# is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0', 'ID', 'Hallgató_ID'], errors="ignore")
In [6]:
# Build the profiling report for the original data and write it to an HTML file.
profile = pandas_profiling.ProfileReport(
    data,
    title='Report on the features of the original data',
)
profile.to_file(output_file="report_on_features_orig.html")
In [7]:
# Empty frame carrying the same column labels as the original data; the
# simulated values are assigned to it one column at a time.
sim = pd.DataFrame(data=None, columns=data.columns)
In [8]:
# Number of rows to simulate.
n_sim_rows = 10000
# A dedicated, seeded generator makes the simulated data (and everything
# exported from it) reproducible; re-running this cell yields the same draw.
rng = np.random.RandomState(1)

for col in list(sim.columns):
    # Empirical marginal distribution of the observed values. `value_counts`
    # drops missing values by default, so the relative frequencies sum to 1
    # and the simulated column never contains NaN.
    freqs = data[col].value_counts(normalize=True)

    if freqs.empty:
        # No observed value to sample from: keep the column entirely missing
        # instead of failing inside the sampler.
        sim[col] = np.full(n_sim_rows, np.nan)
        continue

    # Draw positions rather than the values themselves, so numpy never has to
    # convert the categories into a single array dtype (a mixed int/str
    # column would otherwise be turned into strings).
    picks = rng.choice(len(freqs), size=n_sim_rows, p=freqs.values)
    sim[col] = freqs.index.values[picks]
In [9]:
# Show the profiling report of the simulated data inline (full-width style).
pandas_profiling.ProfileReport(sim, style={"full_width": True})
Out[9]:

In [10]:
# Build the profiling report for the simulated data and write it to an HTML file.
profile_sim = pandas_profiling.ProfileReport(
    sim,
    title='Report on the features of the simulated data',
)
profile_sim.to_file(output_file="report_on_features_simul.html")

Encoding the simulated data with numbers. Later, some variables will be encoded with one-hot encoding.

In [11]:
# Integer-encode every column of the simulated data independently: each column
# is fitted and transformed on its own values.
# NOTE(review): the train/validation/test split in the next cell is built from
# `sim`, not from `sim_enc` — confirm which frame is meant to be exported.
sim_enc = sim.apply(lambda column: LabelEncoder().fit_transform(column))
In [12]:
# Target column and the remaining feature columns.
# `features` is an ordered list (original column order) rather than a set:
# indexing a DataFrame with a set is rejected by recent pandas versions, and a
# set's iteration order is not stable between runs, which would shuffle the
# column order of the exported files.
target_col = 'Statusz_vegzett'
features = [col for col in sim.columns if col != target_col]

# First hold out 20% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    sim[features], sim[target_col], test_size=0.2, random_state=1
)

# ... then split 20% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1
)
In [13]:
# All six splits, features first and targets second, each in
# train / validation / test order.
l = [
    X_train, X_val, X_test,
    y_train, y_val, y_test,
]
In [14]:
# File stems for the splits, in the same order as the entries of `l`.
split_names = ["X_train", "X_val", "X_test", "y_train", "y_val", "y_test"]

# Print the shape and type of each split, then write it to "<name>.csv"
# with a header row and ';' as the field separator.
for name, df in zip(split_names, l):
    print(df.shape, type(df))
    df.to_csv(name + ".csv", header=True, sep=";")
(6400, 25) <class 'pandas.core.frame.DataFrame'>
(1600, 25) <class 'pandas.core.frame.DataFrame'>
(2000, 25) <class 'pandas.core.frame.DataFrame'>
(6400,) <class 'pandas.core.series.Series'>
(1600,) <class 'pandas.core.series.Series'>
(2000,) <class 'pandas.core.series.Series'>